home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Tricks of the Mac Game Programming Gurus
/
TricksOfTheMacGameProgrammingGurus.iso
/
Information
/
CSMP Digest
/
volume 1
/
csmp-v1-225.txt
< prev
next >
Encoding:
Amiga
Atari
Commodore
DOS
FM Towns/JPY
Macintosh
Macintosh JP
NeXTSTEP
RISC OS/Acorn
UTF-8
Wrap
Text File
|
1994-12-08
|
71.9 KB
|
1,966 lines
|
[
TEXT/R*ch
]
C.S.M.P. Digest Mon, 21 Dec 92 Volume 1 : Issue 225
Today's Topics:
Help! making an assembly routine faster
The Comp.Sys.Mac.Programmer Digest is moderated by Michael A. Kelly.
The digest is a collection of article threads from the internet newsgroup
comp.sys.mac.programmer. It is designed for people who read c.s.m.p. semi-
regularly and want an archive of the discussions. If you don't know what a
newsgroup is, you probably don't have access to it. Ask your systems
administrator(s) for details. You can post articles to any newsgroup by
mailing your article to newsgroup@ucbvax.berkeley.edu. So, to post an
article to comp.sys.mac.programmer, you mail it to
comp-sys-mac-programmer@ucbvax.berkeley.edu. Note the '-' instead of '.'
in the newsgroup name.
Each issue of the digest contains one or more sets of articles (called
threads), with each set corresponding to a 'discussion' of a particular
subject. The articles are not edited; all articles included in this digest
are in their original posted form (as received by our news server at
cs.uoregon.edu). Article threads are not added to the digest until the last
article added to the thread is at least one month old (this is to ensure that
the thread is dead before adding it to the digest). Article threads that
consist of only one message are generally not included in the digest.
The entire digest is available for anonymous ftp from ftp.cs.uoregon.edu
[128.223.8.8] in the directory /pub/mac/csmp-digest. Be sure to read the
file /pub/mac/csmp-digest/README before downloading any files. The most
recent issues are available from sumex-aim.stanford.edu [36.44.0.6] in the
directory /info-mac/digest/csmp. If you don't have ftp capability, the sumex
archive has a mail server; send a message with the text '$MACarch help' (no
quotes) to LISTSERV@ricevm1.rice.edu for more information.
The digest is also available via email. Just send a note saying that you
want to be on the digest mailing list to mkelly@cs.uoregon.edu, and you will
automatically receive each new issue as it is created. Sorry, back issues
are not available through the mailing list.
Send administrative mail to mkelly@cs.uoregon.edu.
-------------------------------------------------------
From: mkelly@mystix.cs.uoregon.edu (Michael A. Kelly)
Subject: Help! making an assembly routine faster
Organization: High Risk Ventures
Date: Sat, 14 Nov 1992 09:19:05 GMT
Hey, all you assembly hackers! How can I make this routine faster? As it
is it's only about 40% faster than CopyMask. (I'm using Think C 5.)
/*
 * Quick8CopyMask
 *
 * The QuickXCopyMask family are much faster versions of CopyMask
 * that don't do clipping, dithering, etc. The source and destination
 * PixMaps are expected to have the same bit depth. The X in the name
 * represents the expected bit depth of the source and destination PixMaps.
 *
 * The mask is expected to be exactly the same size as the rectangle
 * that is being copied.
 *
 * The mask is 1 bit deep (bitfield offset 0 = MSB = leftmost pixel);
 * a set bit means "copy this 8-bit pixel". Pixels are processed four
 * at a time by expanding four mask bits into a 32-bit byte mask.
 */
void Quick8CopyMask(
PixMapHandle srcMap,
PixMapHandle dstMap,
Ptr mask,
Point srcPt,
Point dstPt,
short width,
short height )
{
register char *src;
register char *dst;
register long srcNewline;
register long dstNewline;
char mode32 = QD32COMPATIBLE; // SwapMMUMode flag: request 32-bit addressing
short w = (width >> 2) - 1; // whole longwords (4 pixels) per row, minus 1 for DBF
short e = width % 4 - 1; // leftover bytes per row, minus 1 for DBF (-1 means none)
short h = height - 1; // rows, minus 1 for DBF
// Set up pointers to the beginning of the memory to copy
// and calculate the newline value for the source and destination.
// (rowBytes & 0x3FFF strips the PixMap flag bits kept in the top
// bits of rowBytes.)
src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3FFF) * srcPt.v + srcPt.h;
srcNewline = ((*srcMap)->rowBytes & 0x3FFF) - width;
dst = GetPixBaseAddr( dstMap ) + (long) ((*dstMap)->rowBytes & 0x3FFF) * dstPt.v + dstPt.h;
dstNewline = ((*dstMap)->rowBytes & 0x3FFF) - width;
// Switch into 32 bit addressing mode
SwapMMUMode( &mode32 );
// Copy the rect from the source to the destination
asm {
MOVE.W h, D0 ; put height loop variable in D0
MOVEA.L src, A0 ; put the source pixmap address in A0
MOVEA.L dst, A1 ; put the destination address in A1
MOVEA.L mask, A2 ; put the mask address in A2
MOVE.L #0, D3 ; D3 = running bit offset into the 1-bit mask
@1: ; copy the next row
MOVE.W w, D1
@2: ; copy the next four bytes in the row
MOVEQ #0, D2 ; D2 becomes a 32-bit byte mask built from 4 mask bits
BFTST (A2){D3:1} ; test the bit
BEQ @bit2 ; if zero, go to bit 2
ORI.L #0xFF000000, D2 ; else add to pixel mask
@bit2:
ADDQ.L #1, D3 ; increment the bit number
BFTST (A2){D3:1} ; test the bit
BEQ @bit3 ; if zero, go to bit 3
ORI.L #0x00FF0000, D2 ; else add to pixel mask
@bit3:
ADDQ.L #1, D3 ; increment the bit number
BFTST (A2){D3:1} ; test the bit
BEQ @bit4 ; if zero, go to bit 4
ORI.L #0x0000FF00, D2 ; else add to pixel mask
@bit4:
ADDQ.L #1, D3 ; increment the bit number
BFTST (A2){D3:1} ; test the bit
BEQ @inc ; if zero, continue
ORI.L #0x000000FF, D2 ; else add to pixel mask
@inc:
ADDQ.L #1, D3 ; increment the bit number
; speeding this next part up would make a big difference, but how?
; (merge: dst = (src & mask) | (dst & ~mask), 4 pixels at a time)
MOVE.L D2, D4 ; save the mask
NOT.L D4 ; invert the mask
AND.L (A0)+, D2 ; compute the pixels to be copied
AND.L (A1), D4 ; compute the pixels to be saved
OR.L D2, D4 ; combine the copied and saved pixels
MOVE.L D4, (A1)+ ; copy the pixels
DBF D1, @2
TST.W e
BLT @4 ; continue if e is less than 0
MOVE.W e, D1 ; copy the extra bytes, if any
@3: ; copy the next byte
BFTST (A2){D3:1} ; test the next bit in the mask
BEQ @incb ; if zero, continue
MOVE.B (A0)+, (A1)+ ; else copy the pixel
@incb: ; NOTE(review): when the mask bit is clear, A0/A1 are
; not advanced here, so the rest of the row reads/writes
; misaligned bytes -- looks like a bug; verify with a mask
; that has clear bits in the leftover region
ADDQ.L #1, D3 ; increment the bit number
DBF D1, @3
@4:
ADDA.L srcNewline, A0 ; bring the src pointer to the start of the next row
ADDA.L dstNewline, A1 ; bring the dst pointer to the start of the next row
DBF D0, @1
}
// Switch back to the previous addressing mode
SwapMMUMode( &mode32 );
}
I'm new to assembly, as you can probably tell. I'm open to all suggestions.
Thanks,
Mike.
- --
_____________________________________________________________________________
Michael A. Kelly Senior Partner
mkelly@cs.uoregon.edu High Risk Ventures
_____________________________________________________________________________
+++++++++++++++++++++++++++
From: jmunkki@vipunen.hut.fi (Juri Munkki)
Date: 14 Nov 92 20:08:31 GMT
Organization: Helsinki University of Technology
In article <1992Nov14.091905.29520@cs.uoregon.edu> mkelly@mystix.cs.uoregon.edu (Michael A. Kelly) writes:
>Hey, all you assembly hackers! How can I make this routine faster? As it
>is it's only about 40% faster than CopyMask. (I'm using Think C 5.)
This sounds like something I might be able to help with... let's see...
> src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3FFF) * srcPt.v + srcPt.h;
Shouldn't you cast to long before the multiply? It looks to me like you are
casting the result of a short multiply, but I could be wrong, since I don't
want to check this from a C book right now.
> MOVE.W h, D0 ; put height loop variable in D0
> MOVEA.L src, A0 ; put the source pixmap address in A0
> MOVEA.L dst, A1 ; put the destination address in A1
> MOVEA.L mask, A2 ; put the mask address in A2
> MOVE.L #0, D3
>
> @1: ; copy the next row
> MOVE.W w, D1
>
> @2: ; copy the next four bytes in the row
>
> MOVEQ #0, D2 ; test the next four bits in the mask
> BFTST (A2){D3:1} ; test the bit
> BEQ @bit2 ; if zero, go to bit 2
> ORI.L #0xFF000000, D2 ; else add to pixel mask
> @bit2:
> ADDQ.L #1, D3 ; increment the bit number
> BFTST (A2){D3:1} ; test the bit
> BEQ @bit3 ; if zero, go to bit 3
> ORI.L #0x00FF0000, D2 ; else add to pixel mask
> @bit3:
> ADDQ.L #1, D3 ; increment the bit number
> BFTST (A2){D3:1} ; test the bit
> BEQ @bit4 ; if zero, go to bit 4
> ORI.L #0x0000FF00, D2 ; else add to pixel mask
> @bit4:
> ADDQ.L #1, D3 ; increment the bit number
> BFTST (A2){D3:1} ; test the bit
> BEQ @inc ; if zero, continue
> ORI.L #0x000000FF, D2 ; else add to pixel mask
> @inc:
> ADDQ.L #1, D3 ; increment the bit number
Instead of the above code, extract as many bits as you want (I suggest
8 bits, but 4 is also ok) and then use this number as an index to a
table of precalculated masks.
8 bits is fast, because you don't need to use bitfield instructions to
retrieve the value. You then grab two masks from bitfield tables, so
you avoid all that ORI.L stuff and the increments to the bit numbers.
>; speeding this next part up would make a big difference, but how?
You could take care of longword alignment on all reads and writes to
the destination buffer. This requires quite a bit of extra code, but it
might be worth it, since memory accesses can become twice as fast for
the video memory, which is usually very slow. Of course now that the
processors have data caches (unlike the 68020), it's probably not all
that critical.
Another possibility is to grab just a few mask bits (like 4, as I
suggested) at a time and write special code for all the 16 possible
cases. Use a jump table to select the code to use. In the usual case,
where the mask is all black, you get a simple Move.l (An)+,(An)+, which
really should do wonders to this routine. You also have 3 cases where
you do a Move.w (An)+,(An)+ with some adjustments to the registers, 4
cases of move.b, one case where you don't do anything, so that only
leaves you with 7 more complicated cases, where you might want to use a
constant mask.
I think you can get fairly good performance if you carefully code the
two cases where you have an empty mask or a full mask. The rest occur
less often.
Your routine wastes most of its time in the mask handling code. You could
have tested for this by doing timing tests where you replace some part
of the code with fast dummy code and compare the relative speeds. Those
parts that execute much faster as the dummy version need more attention
than those where the difference is small.
- --
Juri Munkki Windsurf: fast sailing
jmunkki@hut.fi Macintosh: fast software
+++++++++++++++++++++++++++
From: mkelly@mystix.cs.uoregon.edu (Michael A. Kelly)
Organization: University of Oregon Computer and Information Sciences Dept.
Date: Mon, 16 Nov 1992 01:48:50 GMT
In article <1992Nov14.200831.20477@nntp.hut.fi> jmunkki@vipunen.hut.fi (Juri Munkki) writes:
>In article <1992Nov14.091905.29520@cs.uoregon.edu> mkelly@mystix.cs.uoregon.edu (Michael A. Kelly) writes:
>>Hey, all you assembly hackers! How can I make this routine faster? As it
>>is it's only about 40% faster than CopyMask. (I'm using Think C 5.)
>
>This sounds like something I might be able to help with... let's see...
>
>> src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3FFF) * srcPt.v + srcPt.h;
>
>Shouldn't you cast to long before the multiply? It looks to me like you are
>casting the result of a short multiply, but I could be wrong, since I don't
>want to check this from a C book right now.
Yep, but if you look closely at the parens, I think you'll find that that's
what I'm doing.
>Another possibility is to grab just a few mask bits (like 4, as I
>suggested) at a time and write special code for all the 16 possible
>cases. Use a jump table to select the code to use.
OK, I did that, and managed to almost triple the speed of my original routine,
making the new routine about four times as fast as CopyMask. And yet, I'd
like to make it even faster. So suggestions are welcome.
Someone else suggested that I just make the mask the same depth as the pixmaps,
so that I could use the mask directly instead of having to extract bits from
it. This turned out to be slower than the jump table approach, only about
three times as fast as CopyMask. Of course, the problem could be with my
assembly skills rather than with the theory.
So, here are the resulting routines. The first uses the jump table approach,
the second uses the wide mask approach. Can they be made even faster??
/*
 * Quick8CopyMask
 *
 * The QuickXCopyMask family are much faster versions of CopyMask
 * that don't do clipping, dithering, etc. The source and destination
 * PixMaps are expected to have the same bit depth. The X in the name
 * represents the expected bit depth of the source and destination PixMaps.
 *
 * The mask is expected to be exactly the same size as the rectangle
 * that is being copied.
 *
 * Jump-table version: each 1-bit mask byte covers eight 8-bit pixels.
 * All-ones and all-zero mask bytes are special-cased as straight
 * copies/skips; anything else is split into two nibbles, each
 * dispatched through a 16-entry table of copy routines.
 */
void Quick8CopyMask(
PixMapHandle srcMap,
PixMapHandle dstMap,
Ptr mask,
Point srcPt,
Point dstPt,
short width,
short height )
{
register char *src;
register char *dst;
register long srcNewline;
register long dstNewline;
char mode32 = QD32COMPATIBLE; // SwapMMUMode flag: request 32-bit addressing
short w = (width >> 3) - 1; // 8-pixel groups per row, minus 1 for DBF
short e = (width & 0x07) - 1; // leftover pixels per row, minus 1 for DBF (-1 means none)
short h = height - 1; // rows, minus 1 for DBF
// Set up pointers to the beginning of the memory to copy
// and calculate the newline value for the source and destination
// (rowBytes & 0x3fff strips the PixMap flag bits).
src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3fff) * srcPt.v + srcPt.h;
srcNewline = ((*srcMap)->rowBytes & 0x3fff) - width;
dst = GetPixBaseAddr( dstMap ) + (long) ((*dstMap)->rowBytes & 0x3fff) * dstPt.v + dstPt.h;
dstNewline = ((*dstMap)->rowBytes & 0x3fff) - width;
// Switch into 32 bit addressing mode
SwapMMUMode( &mode32 );
// Copy the rect from the source to the destination
asm {
MOVE.W h, D0 ; put height loop variable in D0
MOVEA.L src, A0 ; put the source pixmap address in A0
MOVEA.L dst, A1 ; put the destination address in A1
MOVEA.L mask, A2 ; put the mask address in A2
@1: ; copy the next row
MOVE.W w, D1
@2: ; copy the next eight bytes in the row
MOVE.B (A2), D2 ; copy the next mask byte
TST.B D2 ; NOTE(review): redundant -- the MOVE above already set the CCR
BEQ @nocopy ; if zero, don't copy anything
CMPI.B #0xFF, D2
BNE @hardway ; don't copy everything
MOVE.L (A0)+, (A1)+ ; mask is all ones: copy all eight bytes
MOVE.L (A0)+, (A1)+
ADDQ.L #1, A2
JMP @endloop
@nocopy: ; mask is all zeroes: copy nothing, just advance
ADDQ.L #8, A0
ADDQ.L #8, A1
ADDQ.L #1, A2
JMP @endloop
@hardway: ; mixed mask: dispatch one nibble at a time
ANDI.L #0xF0, D2 ; mask off the low four bits
; NOTE(review): the LSR below discards those bits anyway
LSR.W #4, D2 ; shift bits 4-7 into bits 0-3
ADD.W D2, D2 ; double the index (word-sized table entries)
ADD.W @table(D2.W), D2 ; calculate the address
JSR @table(D2.W) ; plot four pixels
CLR.L D2 ; clear the mask register
MOVE.B (A2)+, D2 ; copy the next mask byte
ANDI.B #0xF, D2 ; mask off the high four bits
ADD.W D2, D2 ; double the index
ADD.W @table(D2.W), D2 ; calculate the address
JSR @table(D2.W) ; plot four pixels
@endloop:
DBF D1, @2
TST.W e
BLT @4 ; continue if e is less than 0
MOVE.W e, D1 ; copy the extra bytes, if any
@3: ; copy the next byte
MOVEQ.L #0, D3 ; initialize the bit counter
; NOTE(review): D3 is re-zeroed on every pass and A2 is never
; advanced in this loop, so BTST always tests bit 0 of the same
; mask byte -- looks like a bug; the later revision in this
; thread hoists the init out of the loop and counts down from 7
BTST D3, (A2) ; test the next bit in the mask
BEQ @skip ; if zero, continue
MOVE.B (A0)+, (A1)+ ; else copy the pixel
JMP @incb
@skip:
ADDQ.L #1, A0
ADDQ.L #1, A1
@incb:
ADDQ.L #1, D3 ; increment the bit number
DBF D1, @3
@4:
ADDA.L srcNewline, A0 ; bring the src pointer to the start of the next row
ADDA.L dstNewline, A1 ; bring the dst pointer to the start of the next row
DBF D0, @1
JMP @end ; skip to the end
@table: ; 16 word-sized offsets, indexed by mask nibble
DC.W @sub0
DC.W @sub1
DC.W @sub2
DC.W @sub3
DC.W @sub4
DC.W @sub5
DC.W @sub6
DC.W @sub7
DC.W @sub8
DC.W @sub9
DC.W @sub10
DC.W @sub11
DC.W @sub12
DC.W @sub13
DC.W @sub14
DC.W @sub15
@sub0: ; mask = 0000, draw nothing
ADDQ.L #4, A0
ADDQ.L #4, A1
RTS
@sub1: ; mask = 0001
ADDQ.L #3, A0
ADDQ.L #3, A1
MOVE.B (A0)+, (A1)+
RTS
@sub2: ; mask = 0010
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.B (A0)+, (A1)+
ADDQ.L #1, A0
ADDQ.L #1, A1
RTS
@sub3: ; mask = 0011
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.W (A0)+, (A1)+
RTS
@sub4: ; mask = 0100
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.B (A0)+, (A1)+
ADDQ.L #2, A0
ADDQ.L #2, A1
RTS
@sub5: ; mask = 0101
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.B (A0)+, (A1)+
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.B (A0)+, (A1)+
RTS
@sub6: ; mask = 0110
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.B (A0)+, (A1)+ ; NOTE(review): should be MOVE.W -- 0110 covers two
; bytes, and the net advance here is 3, not 4
; (flagged in the follow-up post)
ADDQ.L #1, A0
ADDQ.L #1, A1
RTS
@sub7: ; mask = 0111
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.B (A0)+, (A1)+
MOVE.W (A0)+, (A1)+
RTS
@sub8: ; mask = 1000
MOVE.B (A0)+, (A1)+
ADDQ.L #3, A0
ADDQ.L #3, A1
RTS
@sub9: ; mask = 1001
MOVE.B (A0)+, (A1)+
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.B (A0)+, (A1)+
RTS
@sub10: ; mask = 1010
MOVE.B (A0)+, (A1)+
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.B (A0)+, (A1)+
ADDQ.L #1, A0
ADDQ.L #1, A1
RTS
@sub11: ; mask = 1011
MOVE.B (A0)+, (A1)+
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.W (A0)+, (A1)+
RTS
@sub12: ; mask = 1100
MOVE.W (A0)+, (A1)+
ADDQ.L #2, A0
ADDQ.L #2, A1
RTS
@sub13: ; mask = 1101
MOVE.W (A0)+, (A1)+
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.B (A0)+, (A1)+
RTS
@sub14: ; mask = 1110
MOVE.W (A0)+, (A1)+
MOVE.B (A0)+, (A1)+
ADDQ.L #1, A0
ADDQ.L #1, A1
RTS
@sub15: ; mask = 1111
MOVE.L (A0)+, (A1)+
RTS
@end:
}
// Switch back to the previous addressing mode
SwapMMUMode( &mode32 );
}
And the wide mask approach:
/*
 * Quick8CopyMask -- "wide mask" variant.
 *
 * Here the mask is the same depth as the pixmaps (one mask byte per
 * 8-bit pixel: 0xFF = copy the source pixel, 0x00 = keep the
 * destination pixel), so mask longwords can be ANDed directly against
 * the pixel data with no bit extraction.
 */
void Quick8CopyMask(
PixMapHandle srcMap,
PixMapHandle dstMap,
Ptr mask,
Point srcPt,
Point dstPt,
short width,
short height )
{
register char *src;
register char *dst;
register long srcNewline;
register long dstNewline;
char mode32 = QD32COMPATIBLE; // SwapMMUMode flag: request 32-bit addressing
short w = (width >> 2) - 1; // longwords (4 pixels) per row, minus 1 for DBF
short e = (width & 0x3) - 1; // leftover bytes per row, minus 1 for DBF (-1 means none)
short h = height - 1; // rows, minus 1 for DBF
// Set up pointers to the beginning of the memory to copy
// and calculate the newline value for the source and destination
// (rowBytes & 0x3fff strips the PixMap flag bits).
src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3fff) * srcPt.v + srcPt.h;
srcNewline = ((*srcMap)->rowBytes & 0x3fff) - width;
dst = GetPixBaseAddr( dstMap ) + (long) ((*dstMap)->rowBytes & 0x3fff) * dstPt.v + dstPt.h;
dstNewline = ((*dstMap)->rowBytes & 0x3fff) - width;
// Switch into 32 bit addressing mode
SwapMMUMode( &mode32 );
// Copy the rect from the source to the destination
asm {
MOVE.W h, D0 ; put height loop variable in D0
MOVEA.L src, A0 ; put the source pixmap address in A0
MOVEA.L dst, A1 ; put the destination address in A1
MOVEA.L mask, A2 ; put the mask address in A2
@1: ; copy the next row
MOVE.W w, D1
@2: ; copy the next four bytes in the row
MOVE.L (A2)+, D2 ; copy the mask to D2
MOVE.L D2, D4 ; save the mask
NOT.L D4 ; invert the mask
AND.L (A0)+, D2 ; compute the pixels to be copied
AND.L (A1), D4 ; compute the pixels to be saved
OR.L D2, D4 ; combine the copied and saved pixels
MOVE.L D4, (A1)+ ; dst = (src & mask) | (dst & ~mask)
DBF D1, @2
TST.W e
BLT @4 ; continue if e is less than 0
MOVE.W e, D1 ; copy the extra bytes, if any
@3: ; copy the next byte
MOVE.B (A2)+, D2 ; copy the mask to D2
MOVE.B D2, D4 ; save the mask
NOT.B D4 ; invert the mask
AND.B (A0)+, D2 ; compute the pixels to be copied
AND.B (A1), D4 ; compute the pixels to be saved
OR.B D2, D4 ; combine the copied and saved pixels
MOVE.B D4, (A1)+ ; copy the pixels
DBF D1, @3
@4:
ADDA.L srcNewline, A0 ; bring the src pointer to the start of the next row
ADDA.L dstNewline, A1 ; bring the dst pointer to the start of the next row
DBF D0, @1
}
// Switch back to the previous addressing mode
SwapMMUMode( &mode32 );
}
- --
_____________________________________________________________________________
Michael A. Kelly Senior Partner
mkelly@cs.uoregon.edu High Risk Ventures
_____________________________________________________________________________
+++++++++++++++++++++++++++
From: jmunkki@vipunen.hut.fi (Juri Munkki)
Date: 16 Nov 92 19:09:47 GMT
Organization: Helsinki University of Technology
In article <1992Nov16.014850.28678@cs.uoregon.edu> mkelly@mystix.cs.uoregon.edu (Michael A. Kelly) writes:
>So, here are the resulting routines. The first uses the jump table approach,
>the second uses the wide mask approach. Can they be made even faster??
Yes.
> @2: ; copy the next eight bytes in the row
>
> MOVE.B (A2), D2 ; copy the next mask byte
>
> TST.B D2
A move instruction always does an implied tst, so you can just throw away
the test instruction.
> BEQ @nocopy ; if zero, don't copy anything
>
> CMPI.B #0xFF, D2
> BNE @hardway ; don't copy everything
An addq.w #1, and then a beq might prove to be faster than the cmp with
an immediate value. You have to adjust the mask back to its old value,
if the test fails, but this can be done either with the jump tables
(not with the ones you are using now, but the longer ones I will suggest
later in this article) or by a subq.w #1
>
> MOVE.L (A0)+, (A1)+ ; copy all bytes
> MOVE.L (A0)+, (A1)+
> ADDQ.L #1, A2
Do a move.b (A2)+ instead of this instruction. I can't see any reason why
you can't do the increment there.
> JMP @endloop
Copy the end of the loop here. So that you have the DBF instruction here
instead of a JMP. Put the jump after the DBF. There's absolutely no reason
to jump around when you can just use another DBF.
> @nocopy: ; copy no bytes
> ADDQ.L #8, A0
> ADDQ.L #8, A1
> ADDQ.L #1, A2
> JMP @endloop
Same here as above.
> @hardway:
> ANDI.L #0xF0, D2 ; mask off the low four bits
> LSR.W #4, D2 ; shift bits 4-7 into bits 0-3
The AND is totally wasted. The LSR will do the masking for you. This
is assuming that you can keep the high bytes of D2 cleared. I think
you should be able to do it. (I think it's already that way.)
You can also eliminate the and and lsr, if you use two 256-entry jump
tables that simply ignore the high or low 4 bits. The tables will take
some memory (2 x 4 x 256 bytes), but they are easy to construct with
copy and paste.
> ADD.W D2, D2 ; double the index
> ADD.W @table(D2.W), D2 ; calculate the address
> JSR @table(D2.W) ; plot four pixels
The 68020 has addressing modes that do the multiplication of the index.
I haven't needed them myself, but I'm fairly certain that you can improve
this part with the right addressing mode.
Replace the jsr with a LEA An to the return address and a JMP to the
subroutine. Then jump back with a JMP (An). This is quite a bit faster
than a JSR/RTS combination, although it's not "good style".
> CLR.L D2 ; clear the mask register
> MOVE.B (A2)+, D2 ; copy the next mask byte
> ANDI.B #0xF, D2 ; mask off the high four bits
Use BFEXTU, if you must read the mask again. Remember that you can use
- -1(A2), if you already incremented A2 or you might be able to account
for this with the bitfield offset. You can also use constant bitfield
offsets, if I remember correctly. I think you have some registers that
you could use, so you could store fairly constant bitfield indices
there.
> @sub6: ; mask = 0110
> ADDQ.L #1, A0
> ADDQ.L #1, A1
> MOVE.B (A0)+, (A1)+
This should be a move.w
> ADDQ.L #1, A0
> ADDQ.L #1, A1
> RTS
>
> @sub8: ; mask = 1000
> MOVE.B (A0)+, (A1)+
> ADDQ.L #3, A0
> ADDQ.L #3, A1
> RTS
A move.b (a0),(a1) along with addq #4 is faster on a 68000, but I
don't think it matters on new processors. I may be wrong, but you'll
probably never see the difference.
In the deep mask version, you could unroll the loop. It's kind of
surprising that the 1 bit mask is actually faster, but it's mostly
because of the superior algorithm that allows you to directly copy
8 bytes at a time in the most common case.
I think you did really well with the assembly. My changes will probably
not make a big difference. I think 5% is the best you can hope for, but
it might be as much as 10%. The only way to go beyond this is to make
the move.l commands aligned on long word destinations, as I mentioned
in my previous article.
I hope my articles offer proof for the other half of my .signature... :-)
Can anyone do significantly better? I really love optimizing graphics
routines.
- --
Juri Munkki Windsurf: fast sailing
jmunkki@hut.fi Macintosh: fast software
+++++++++++++++++++++++++++
From: mkelly@mystix.cs.uoregon.edu (Michael A. Kelly)
Organization: High Risk Ventures
Date: Wed, 18 Nov 1992 01:08:15 GMT
In article <1992Nov16.190947.9920@nntp.hut.fi> jmunkki@vipunen.hut.fi (Juri Munkki) writes:
>In article <1992Nov16.014850.28678@cs.uoregon.edu> mkelly@mystix.cs.uoregon.edu (Michael A. Kelly) writes:
>> CMPI.B #0xFF, D2
>> BNE @hardway ; don't copy everything
>
>An addq.w #1, and then a beq might prove to be faster than the cmp with
>an immediate value. You have to adjust the mask back to its old value,
>if the test fails, but this can be done either with the jump tables
>(not with the ones you are using now, but the longer ones I will suggest
>later in this article) or by a subq.w #1
According to the Motorola manual, you're right. But in practice this slowed
things down quite a bit. I can't figure out why. I replaced the CMPI with
an ADDQ #1, then at @hardway I did a SUBQ #1. My test case is a 32x32 rect
with a 32x32 filled circle as the mask. I think it would slow things down
a lot more with more complicated masks. But still, I don't know why it's
slower, since the CMPI takes 8 clock cycles and the ADDQ and SUBQ each
take 4, so it really should be faster.... Then again, those timings are
for the 68000 (and 68020 too I think), and I'm using a 68040.
>> @hardway:
>> ANDI.L #0xF0, D2 ; mask off the low four bits
>> LSR.W #4, D2 ; shift bits 4-7 into bits 0-3
>
>The AND is totally wasted. The LSR will do the masking for you.
:) I don't know *what* I was thinking....
At this point, I ran my test again with the above modifications. They
improved the speed by about 10%. (Changing the CMPI above decreased
performance by about 30% with these other changes also in place.)
>You can also eliminate the and and lsr, if you use two 256-entry jump
>tables that simply ignore the high or low 4 bits. The tables will take
>some memory (2 x 4 x 256 bytes), but they are easy to construct with
>copy and paste.
You mean two 16-entry jump tables, right? I didn't implement this, but
instead made a separate CopyMask function that used a single 256-entry
jump table, with 256 subroutines for each of the 256 possible mask-bytes.
See the code fragment below.
Hey, maybe I could just save 256 (times two) masks, AND each mask with the
source and destination bytes, then OR those two results together to get
the resulting pixel. Hmmm, I wonder if it would be even faster....
>> ADD.W D2, D2 ; double the index
>> ADD.W @table(D2.W), D2 ; calculate the address
>> JSR @table(D2.W) ; plot four pixels
>
>The 68020 has addressing modes that do the multiplication of the index.
>I haven't needed them myself, but I'm fairly certain that you can improve
>this part with the right addressing mode.
Nope, I don't think so. You're talking about Address Register Indirect with
Offset and Index, like so: @table( <no address in this case>, D2.W*2 ).
The problem is that the value of D2 is preserved in that operation, so instead
of D2 = (D2 * 2) + @table + (D2 * 2), you get D2 = D2 + @table + (D2 * 2).
>Replace the jsr with a LEA An to the return address and a JMP to the
>subroutine. Then jump back with a JMP (An). This is quite a bit faster
>than a JSR/RTS combination, although it's not "good style".
Wow, that made a big difference! About a 17% improvement, making the total
speedup about 25%.
>> CLR.L D2 ; clear the mask register
>> MOVE.B (A2)+, D2 ; copy the next mask byte
>> ANDI.B #0xF, D2 ; mask off the high four bits
>
>Use BFEXTU, if you must read the mask again. Remember that you can use
>-1(A2), if you already incremented A2 or you might be able to account
>for this with the bitfield offset. You can also use constant bitfield
>offsets, if I remember correctly. I think you have some registers that
>you could use, so you could store fairly constant bitfield indices
>there.
I'm not sure what you mean by constant offsets. I did this:
BFEXTU -1(A2){4:4}
and it slowed it down by about 4%.
>> @sub8: ; mask = 1000
>> MOVE.B (A0)+, (A1)+
>> ADDQ.L #3, A0
>> ADDQ.L #3, A1
>> RTS
>
>A move.b (a0),(a1) along with addq #4 is faster on a 68000, but I
>don't think it matters on new processors. I may be wrong, but you'll
>probably never see the difference.
You're right, it didn't make any difference at all on my '040.
>In the deep mask version, you could unroll the loop. It's kind of
>surprising the the 1 bit mask is actually faster, but it's mostly
>because of the superior algorithm that allows you to directly copy
>8 bytes at a time in the most common case.
I tossed that code. I don't really think unrolling the loop will get it
down to my current speed, which is more than twice as fast.
>it might be as much as 10%. The only way to go beyond this is to make
>the move.l commands aligned on long word destinations, as I mentioned
>in my previous article.
But as long as I align the source and destination Pixmaps, that isn't an
issue, right?
>I hope my articles offer proof for the other half of my .signature... :-)
Definitely :)
OK, here's the new code. The first one is the newer, better version of
Quick8CopyMask, with most of the optimizations suggested by Juri. It's
about 5.5 times as fast as QuickDraw's CopyMask, at least with my simple
circle mask test case. The second one is a small part of a very large
Quick8CopyMask that has 256 separate subroutines to handle each mask
byte, rather than only 16 subroutines to handle a mask nibble (a nibble is
half a byte, right?). It's far too long to post here, but if you want a
copy I'll be happy to email it to you. It's about 6.5 times as fast as
CopyMask; about 15% faster than the short version.
I tested the routines with the mask used in the CalcCMask DTS snippet;
the short version was 5.7 times as fast as CopyMask and the long version
was 7 times as fast.
And once again, if anyone can improve on these routines, please tell me how!
/*
 * Quick8CopyMask -- tuned jump-table version.
 *
 * One 1-bit mask byte covers eight 8-bit pixels; a set bit means
 * "copy this pixel". All-ones and all-zero mask bytes are
 * special-cased as straight copies/skips; mixed bytes are split into
 * nibbles and dispatched through a 16-entry offset table. Subroutine
 * linkage uses LEA/JMP (A3) instead of JSR/RTS, which the thread
 * measured as noticeably faster on a 68040.
 */
void Quick8CopyMask(
PixMapHandle srcMap,
PixMapHandle dstMap,
Ptr mask,
Point srcPt,
Point dstPt,
short width,
short height )
{
register char *src;
register char *dst;
register long srcNewline;
register long dstNewline;
char mode32 = QD32COMPATIBLE; // SwapMMUMode flag: request 32-bit addressing
short w = (width >> 3) - 1; // 8-pixel groups per row, minus 1 for DBF
short e = (width & 0x07) - 1; // leftover pixels per row, minus 1 for DBF (-1 means none)
short h = height - 1; // rows, minus 1 for DBF
// Set up pointers to the beginning of the memory to copy
// and calculate the newline value for the source and destination
// (rowBytes & 0x3fff strips the PixMap flag bits).
src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3fff) * srcPt.v + srcPt.h;
srcNewline = ((*srcMap)->rowBytes & 0x3fff) - width;
dst = GetPixBaseAddr( dstMap ) + (long) ((*dstMap)->rowBytes & 0x3fff) * dstPt.v + dstPt.h;
dstNewline = ((*dstMap)->rowBytes & 0x3fff) - width;
// Switch into 32 bit addressing mode
SwapMMUMode( &mode32 );
// Copy the rect from the source to the destination
asm {
MOVE.W h, D0 ; put height loop variable in D0
MOVEA.L src, A0 ; put the source pixmap address in A0
MOVEA.L dst, A1 ; put the destination address in A1
MOVEA.L mask, A2 ; put the mask address in A2
CLR.L D2 ; clear the mask register once; the high bytes must
; stay zero for the word-sized table indexing below
@1: ; copy the next row
MOVE.W w, D1
@2: ; copy the next eight bytes in the row
MOVE.B (A2)+, D2 ; copy the next mask byte (MOVE also sets the CCR)
BEQ @nocopy ; if zero, don't copy anything
CMPI.B #0xFF, D2
BNE @hardway ; don't copy everything
MOVE.L (A0)+, (A1)+ ; mask is all ones: copy all eight bytes
MOVE.L (A0)+, (A1)+
DBF D1, @2 ; loop directly rather than jumping to the bottom
JMP @endloop
@nocopy: ; mask is all zeroes: copy nothing, just advance
ADDQ.L #8, A0
ADDQ.L #8, A1
DBF D1, @2
JMP @endloop
@hardway: ; mixed mask: dispatch one nibble at a time
LSR.W #4, D2 ; shift bits 4-7 into bits 0-3 (high nibble first)
ADD.W D2, D2 ; double the index (word-sized table entries)
ADD.W @table(D2.W), D2 ; calculate the address
LEA @rts1, A3 ; save the return address
JMP @table(D2.W) ; plot four pixels
@rts1:
MOVE.B -1(A2), D2 ; re-read the same mask byte (A2 already advanced)
ANDI.B #0xF, D2 ; mask off the high four bits
ADD.W D2, D2 ; double the index
ADD.W @table(D2.W), D2 ; calculate the address
LEA @rts2, A3 ; save the return address
JMP @table(D2.W) ; plot four pixels
@rts2:
DBF D1, @2
@endloop:
TST.W e
BLT @4 ; continue if e is less than 0
MOVE.B (A2)+, D2 ; copy the next mask byte
MOVE.W e, D1 ; initialize the loop counter
MOVEQ.L #7, D3 ; initialize the bit counter (bit 7 = leftmost pixel)
@3: ; copy the next byte
BTST D3, D2 ; test the next bit in the mask
BEQ @skip ; if zero, continue
MOVE.B (A0)+, (A1)+ ; else copy the pixel
SUBQ.L #1, D3 ; decrement the bit counter
DBF D1, @3
JMP @4
@skip: ; bit clear: advance past the pixel without copying
ADDQ.L #1, A0
ADDQ.L #1, A1
SUBQ.L #1, D3 ; decrement the bit counter
DBF D1, @3
@4:
ADDA.L srcNewline, A0 ; bring the src pointer to the start of the next row
ADDA.L dstNewline, A1 ; bring the dst pointer to the start of the next row
DBF D0, @1
JMP @end ; skip to the end
@table: ; 16 word-sized offsets, indexed by mask nibble
DC.W @sub0
DC.W @sub1
DC.W @sub2
DC.W @sub3
DC.W @sub4
DC.W @sub5
DC.W @sub6
DC.W @sub7
DC.W @sub8
DC.W @sub9
DC.W @sub10
DC.W @sub11
DC.W @sub12
DC.W @sub13
DC.W @sub14
DC.W @sub15
@sub0: ; mask = 0000, draw nothing
ADDQ.L #4, A0
ADDQ.L #4, A1
JMP (A3) ; RTS
@sub1: ; mask = 0001
ADDQ.L #3, A0
ADDQ.L #3, A1
MOVE.B (A0)+, (A1)+
JMP (A3) ; RTS
@sub2: ; mask = 0010
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
JMP (A3) ; RTS
@sub3: ; mask = 0011
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.W (A0)+, (A1)+
JMP (A3) ; RTS
@sub4: ; mask = 0100
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.B (A0), (A1)
ADDQ.L #3, A0
ADDQ.L #3, A1
JMP (A3) ; RTS
@sub5: ; mask = 0101
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.B (A0)+, (A1)+
JMP (A3) ; RTS
@sub6: ; mask = 0110
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.W (A0), (A1)
ADDQ.L #3, A0
ADDQ.L #3, A1
JMP (A3) ; RTS
@sub7: ; mask = 0111
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.B (A0)+, (A1)+
MOVE.W (A0)+, (A1)+
JMP (A3) ; RTS
@sub8: ; mask = 1000
MOVE.B (A0), (A1)
ADDQ.L #4, A0
ADDQ.L #4, A1
JMP (A3) ; RTS
@sub9: ; mask = 1001
MOVE.B (A0), (A1)
ADDQ.L #3, A0
ADDQ.L #3, A1
MOVE.B (A0)+, (A1)+
JMP (A3) ; RTS
@sub10: ; mask = 1010
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
JMP (A3) ; RTS
@sub11: ; mask = 1011
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.W (A0)+, (A1)+
JMP (A3) ; RTS
@sub12: ; mask = 1100
MOVE.W (A0), (A1)
ADDQ.L #4, A0
ADDQ.L #4, A1
JMP (A3) ; RTS
@sub13: ; mask = 1101
MOVE.W (A0), (A1)
ADDQ.L #3, A0
ADDQ.L #3, A1
MOVE.B (A0)+, (A1)+
JMP (A3) ; RTS
@sub14: ; mask = 1110
MOVE.W (A0)+, (A1)+
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
JMP (A3) ; RTS
@sub15: ; mask = 1111
MOVE.L (A0)+, (A1)+
JMP (A3) ; RTS
@end:
}
// Switch back to the previous addressing mode
SwapMMUMode( &mode32 );
}
And this is the extremely long version, truncated for this posting:
// Quick8CopyMask (long, 256-way jump-table version): copies a
// width x height rectangle of 8-bit pixels from srcMap to dstMap,
// drawing only those pixels whose corresponding bit in `mask` is set
// (one mask bit per pixel byte; mask bytes are consumed sequentially,
// ceil(width/8) bytes per row, with no per-row padding).
// As posted, most of the 256 per-mask-byte subroutines are elided
// (". ." lines), so this listing is illustrative and will not
// assemble as-is.
// NOTE(review): assumes both pixmaps are 8 bits deep and locked, and
// that the rectangle lies entirely inside both -- no clipping or
// bounds checking is done.
void Quick8CopyMask(
PixMapHandle srcMap,
PixMapHandle dstMap,
Ptr mask,    // packed 1-bit mask, MSB of each byte = leftmost pixel
Point srcPt, // top-left corner of the source rectangle
Point dstPt, // top-left corner of the destination rectangle
short width,
short height )
{
register char *src;
register char *dst;
register long srcNewline;      // bytes to skip from end of one src row to start of the next
register long dstNewline;      // bytes to skip from end of one dst row to start of the next
char mode32 = QD32COMPATIBLE;  // addressing-mode flag passed to (and updated by) SwapMMUMode
short w = (width >> 3) - 1;    // DBF counter: whole 8-pixel groups per row, minus one
short e = (width & 0x07) - 1;  // DBF counter: leftover pixels (< 8) per row, minus one (-1 = none)
short h = height - 1;          // DBF counter: rows, minus one
// Set up pointers to the beginning of the memory to copy
// and calculate the newline value for the source and destination
// ((rowBytes & 0x3fff) strips the PixMap flag bits, leaving the row stride)
src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3fff) * srcPt.v + srcPt.h;
srcNewline = ((*srcMap)->rowBytes & 0x3fff) - width;
dst = GetPixBaseAddr( dstMap ) + (long) ((*dstMap)->rowBytes & 0x3fff) * dstPt.v + dstPt.h;
dstNewline = ((*dstMap)->rowBytes & 0x3fff) - width;
// Switch into 32 bit addressing mode
SwapMMUMode( &mode32 );
// Copy the rect from the source to the destination
asm {
MOVE.W h, D0 ; put height loop variable in D0
MOVEA.L src, A0 ; put the source pixmap address in A0
MOVEA.L dst, A1 ; put the destination address in A1
MOVEA.L mask, A2 ; put the mask address in A2
CLR.L D2 ; clear the mask register
@1: ; copy the next row
MOVE.W w, D1
@2: ; copy the next eight bytes in the row
CLR.W D2 ; clear the mask register (full word is used as the table index)
MOVE.B (A2)+, D2 ; copy the next mask byte
BEQ @nocopy ; if zero, don't copy anything
CMPI.B #0xFF, D2
BNE @hardway ; don't copy everything
MOVE.L (A0)+, (A1)+ ; copy all bytes
MOVE.L (A0)+, (A1)+ ; (eight bytes total for a solid mask byte)
DBF D1, @2
JMP @endloop
@nocopy: ; copy no bytes
ADDQ.L #8, A0
ADDQ.L #8, A1
DBF D1, @2
JMP @endloop
@hardway:
ADD.W D2, D2 ; double the index (the table entries are words)
ADD.W @table(D2.W), D2 ; calculate the address (entries must be table-relative offsets -- see the follow-up post, which spells them @subN-@table)
JMP @table(D2.W) ; plot eight pixels
@endloop:
; handle the 1..7 leftover pixels at the end of the row, one bit at a time
TST.W e
BLT @4 ; continue if e is less than 0
MOVE.B (A2)+, D2 ; copy the next mask byte
MOVE.W e, D1 ; initialize the loop counter
MOVEQ.L #7, D3 ; initialize the bit counter (test MSB first)
@3: ; copy the next byte
BTST D3, D2 ; test the next bit in the mask
BEQ @skip ; if zero, continue
MOVE.B (A0)+, (A1)+ ; else copy the pixel
SUBQ.L #1, D3 ; decrement the bit counter
DBF D1, @3
JMP @4
@skip:
ADDQ.L #1, A0
ADDQ.L #1, A1
SUBQ.L #1, D3 ; decrement the bit counter
DBF D1, @3
@4:
ADDA.L srcNewline, A0 ; bring the src pointer to the start of the next row
ADDA.L dstNewline, A1 ; bring the dst pointer to the start of the next row
DBF D0, @1
JMP @end ; skip to the end
@table: ; one word entry per possible mask byte (256 entries)
DC.W @sub0
DC.W @sub1
DC.W @sub2
DC.W @sub3
. .
. .
. .
DC.W @sub253
DC.W @sub254
DC.W @sub255
; Each subroutine plots the pixels for one mask byte, then closes the
; loop iteration itself (DBF back to @2) instead of returning.
@sub0: ; mask = 00000000
ADDQ.L #8, A0
ADDQ.L #8, A1
DBF D1, @2
JMP @endloop
@sub1: ; mask = 00000001
ADDQ.L #7, A0
ADDQ.L #7, A1
MOVE.B (A0)+, (A1)+
DBF D1, @2
JMP @endloop
@sub2: ; mask = 00000010
ADDQ.L #6, A0
ADDQ.L #6, A1
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
DBF D1, @2
JMP @endloop
. .
. .
. .
@sub182: ; mask = 10110110
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.W (A0), (A1)
ADDQ.L #3, A0
ADDQ.L #3, A1
MOVE.W (A0), (A1)
ADDQ.L #3, A0
ADDQ.L #3, A1
DBF D1, @2
JMP @endloop
@sub183: ; mask = 10110111
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.W (A0), (A1)
ADDQ.L #3, A0
ADDQ.L #3, A1
MOVE.B (A0)+, (A1)+
MOVE.W (A0)+, (A1)+
DBF D1, @2
JMP @endloop
. .
. .
. .
@sub253: ; mask = 11111101
MOVE.L (A0)+, (A1)+
MOVE.W (A0), (A1)
ADDQ.L #3, A0
ADDQ.L #3, A1
MOVE.B (A0)+, (A1)+
DBF D1, @2
JMP @endloop
@sub254: ; mask = 11111110
MOVE.L (A0)+, (A1)+
MOVE.W (A0)+, (A1)+
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
DBF D1, @2
JMP @endloop
@sub255: ; mask = 11111111
MOVE.L (A0)+, (A1)+
MOVE.L (A0)+, (A1)+
DBF D1, @2
JMP @endloop
@end:
}
// Switch back to the previous addressing mode
SwapMMUMode( &mode32 );
}
- --
_____________________________________________________________________________
Michael A. Kelly Senior Partner
mkelly@cs.uoregon.edu High Risk Ventures
_____________________________________________________________________________
+++++++++++++++++++++++++++
From: jmunkki@vipunen.hut.fi (Juri Munkki)
Organization: Helsinki University of Technology
Date: Wed, 18 Nov 1992 20:33:45 GMT
In article <1992Nov18.010815.6649@cs.uoregon.edu> mkelly@mystix.cs.uoregon.edu (Michael A. Kelly) writes:
> According to the Motorola manual, you're right. But in practice this slowed
> things down quite a bit. I can't figure out why. I replaced the CMPI with
> an ADDQ #1, then at @hardway I did a SUBQ #1. My test case is a 32x32 rect
> with a 32x32 filled circle as the mask. I think it would slow things down
> a lot more with more complicated masks. But still, I don't know why it's
> slower, since the CMPI takes 8 clock cycles and the ADDQ and SUBQ each
> take 4, so it really should be faster.... Then again, those timings are
> for the 68000 (and 68020 too I think), and I'm using a 68040.
On the 040, the instructions can overlap quite a bit. I guess that the
modification of a data register prevented the overlap. I suggest that
you try storing the constant 0xFF in a free data register and doing
the compare with the data register. Register to register compares should
always be faster than immediate to register compares.
> >it might be as much as 10%. The only way to go beyond this is to make
> >the move.l commands aligned on long word destinations, as I mentioned
> >in my previous article.
>
> But as long as I align the source and destination Pixmaps, that isn't an
> issue, right?
I thought about this alignment stuff and it occurred to me that the mask
bitmap would be a lot harder to use if you aligned your writes to video
RAM. On the Quadras, video RAM is so fast that alignment probably doesn't
matter all that much. On NuBUS, things are usually quite different.
> OK, here's the new code. The first one is the newer, better version of
> Quick8CopyMask, with most of the optimizations suggested by Juri. It's
> about 5.5 times as fast as QuickDraw's CopyMask, at least with my simple
> circle mask test case. The second one is a small part of a very large
> Quick8CopyMask that has 256 separate subroutines to handle each mask
> byte, rather than only 16 subroutines to handle a mask nibble (a nibble is
> half a byte, right?). It's far too long to post here, but if you want a
> copy I'll be happy to email it to you. It's about 6.5 times as fast as
> CopyMask; about 15% faster than the short version.
>
> I tested the routines with the mask used in the CalcCMask DTS snippet;
> the short version was 5.7 times as fast as CopyMask and the long version
> was 7 times as fast.
It should be quite hard to improve speed from the longer code. I bet it took
quite a few minutes to write it. :-)
I do have an idea that you could try, if you still feel like the code should
be improved.
Snippet from long version:
> @1: ; copy the next row
> MOVE.W w, D1
> @2: ; copy the next eight bytes in the row
> CLR.W D2 ; clear the mask register
> MOVE.B (A2)+, D2 ; copy the next mask byte
> BEQ @nocopy ; if zero, don't copy anything
>
> CMPI.B #0xFF, D2
> BNE @hardway ; don't copy everything
>
> MOVE.L (A0)+, (A1)+ ; copy all bytes
> MOVE.L (A0)+, (A1)+
>
> DBF D1, @2
> JMP @endloop
>
> @nocopy: ; copy no bytes
> ADDQ.L #8, A0
> ADDQ.L #8, A1
>
> DBF D1, @2
> JMP @endloop
>
> @hardway:
> ADD.W D2, D2 ; double the index
> ADD.W @table(D2.W), D2 ; calculate the address
> JMP @table(D2.W) ; plot eight pixels
I finally dug up my 020 manual and went through the addressing modes.
Instead of having a jump table, you should probably use a table of jumps. :-)
clr.w D2
@1
move.w w,D1
@2
move.b (A2)+,D2
jmp (@jumptable,PC,D2.w*4)
@jumptable bra.w @mask0
bra.w @mask1
bra.w @mask2
bra.w @mask3
...
bra.w @mask254
move.l (A0)+,(A1)+ ; This is mask 255
move.l (A0)+,(A1)+
dbf D1,@2
...
I checked with Think C and at least the above code (or something similar)
to it compiles and the disassembly looks reasonable.
Note that i removed the special checks for 0 and 255. I think they are
mostly wasted, but it's possible they speed things with masks with large
solid areas.
- --
Juri Munkki Windsurf: fast sailing
jmunkki@hut.fi Macintosh: fast software
+++++++++++++++++++++++++++
From: mxmora@unix.SRI.COM (Matt Mora)
Date: 19 Nov 92 17:43:28 GMT
Organization: SRI International, Menlo Park, California
In article <1992Nov18.010815.6649@cs.uoregon.edu> mkelly@mystix.cs.uoregon.edu (Michael A. Kelly) writes:
>
>And once again, if anyone can improve on these routines, please tell me how!
If you're going to be calling this function a lot (like in a tight loop
in a game to plot sprites) You can move the Swapmmumode code out of the
function and call it before you make the call and restore it afterward.
That takes the two trap calls out of your fast code.
like:
swapmmumode(mode);
while plottingsprites
QuickCopy(sprites[i++]);
swapmmumode(backtowhatiswas);
If you can find a way to precompute the addresses (like a table of row
starting addresses) that might help. They mention stuff like this in one
of the develop articles.
Matt
- --
___________________________________________________________
Matthew Mora | my Mac Matt_Mora@sri.com
SRI International | my unix mxmora@unix.sri.com
___________________________________________________________
+++++++++++++++++++++++++++
From: mkelly@mystix.cs.uoregon.edu (Michael A. Kelly)
Organization: University of Oregon Computer and Information Sciences Dept.
Date: Fri, 20 Nov 1992 02:13:40 GMT
>On the 040, the instructions can overlap quite a bit. I guess that the
>modification of a data register prevented the overlap. I suggest that
>you try storing the constant 0xFF in a free data register and doing
>the compare with the data register. Register to register compares should
>always be faster than immediate to register compares.
I did that, and it doesn't seem to make any difference. My timings are to
the 1/10000 of a tick.
>It should be quite hard to improve speed from the longer code. I bet it took
>quite a few minutes to write it. :-)
About 120, although I didn't do it all in one sitting so it's hard to say :^/
>I do have an idea that you could try, if you still feel like the code should
>be improved.
>Instead of having a jump table, you could probably use a table of jumps. :-)
It did make a very slight difference in the long version, but no difference
in the short version. I think this is mostly because I still have to deal
with the mask in the short version, so it really didn't change much.
>Note that I removed the special checks for 0 and 255. I think they are
>mostly wasted, but it's possible they speed things with masks with large
>solid areas.
Removing them makes the long version faster, and the short version slower.
Here's the new long version, in its entirety. Enjoy.
(This file must be converted with BinHex 4.0)
:$%0[F(P0BA0V,Q0`G!"338083e"$9!!!!!!@#!!!!!$YpJ%"!2%!!"A1J$CQGfG
BB!L!H!F'!)F!"JF!#!K`!!K3L!F!!!!!C`H!F)GQ"SGQG`L'!!F)#(!'D'CRD
)#)KhChKi!(J!K`L)J!!!!!#!#!"h!!#!B!!!!!!!#!#!!!!!!)#!!(!!!!!!!)!
!!(!!!!!)F)!!#!!!!!!!!)!)"`!!J!!!!!!(#!!"0$"8"38&!a)d3)LFJ69!U"0
5P'K&L,NhDL3#G2&%d@Z5Iq1cTGMj9*GERM*hIAjVhRRLZhhYLhhqlAeBq(aTE[p
!f3k5TrAYql(%I[p%jaFm5h-kA8L6djVNaAr-e'ebGAPdBUeSc#BPZF&LYKa@12$
k-9#D*TEA**rm8&aljDd6YIjl9Yr+50bNZe*9p[4d4QA0$4kf'Qf!qk`"4'GVadV
1jYhEfi$H[4M)M2LK1rNrK1hkNlQRk4XH*GLJ6QaXpGlfi[l2db[%#Xf1j%@GI`q
!'8U(q*(+[q2bjHR[B,ZrJ[e2*l+QRCZqVI`Ehf!!!!"#!!"iH+B!"J!!!!!!!!!
!!%!+!)ThLAPiBd9QC9D)#B#(H(LUUU#'UALALEQ3!!!!!!9fCAKPZ'GPB'9AKiQ
C#3!)N!!J!!4&9@ChGN0&4BHSHAL)L(QQCjH(LCZAHDKSX*KiH)-r46099RChGRH
*L)H'4@CQCfChLCQ)PjKiGkL*HBZVUCQUQCQULELCUkQ3!*!!QJLCTkLBQCQCHAU
E#VS!!!#kc!RdHI(,cbq(KEUqGAYZhrYTV[qC%5)JH,il,%dCr9FM-Zk[[YfrY+[
KC,[m0@`L)Ai@F*I5cKIEGZ[PGh5D@#iX0PH%YGflr"8'PUfflppZljFdYGZ'bcJ
NKGQ'VRP)V+ll[$KUXPAZebefAi@lUm#-8+`IVYqZQ[IHNcHckll0@&QZ@&dYPId
XN4K8VkqqbAED4NeQr$BSa$pC@lNNHh%4iV"q&QrKCICZ`[54GH`THR*4qN`pV"8
P!2cY,AqmV9EEf!@l5XEVeB%E3[$Pp[lNBJQcMJ4+UphbfN'9K'KbeAElE0DM&[4
Mar5kh@$dH6'Sd6rC!TYXaAm04%0jJ4EAIJSLf($&p[kiLKdU(A@lX-@)UDpH$5L
"%[54#rCG`+L(MEV`f0+)E,,IPX`Pbiriaib0mq9U89HVCA`aHFM$28ULT')Y5+l
EYhb+cAreRMYYh@+Sf4,fK'dQ"BXAIGVXSR,m*I$USRlIadpI4mIqG2[pa%KALBL
1mL3VPkrA+MPPpmSD8IXD%ImdXReb8R5(B4)9#dQM12&k24,XX`PiEjEbYE*F-,%
19pID3BlYaAbU@QZqc[Zi%LB*F0)pY*1bfTY9Hh9iEDm%lCfj-V5qPHh`XPhAF(G
'U8!%E)3+rkX`*%GdehfG'[A`j**a55jCIG,N+h$PPbFKAF%4APqrem,[(TrE#bp
)c8GhGhFT,ib88A2p#*NQK@`TiA["+eTiX#qq56HNQUNHmX&8ii`#UD+YJ95MR5#
UD&&Bqd4GTK9hX#AK90p2MEJ6UF%I2G)PUNYlZXJDX[[+qBYVmSpl2'[ITdrRT)K
r*,c+4UmUk+dXLbqm9,ZiAGl8q&5,X(b9hpm[i+K1,6q2rhZjrdaE1DA9N!#b6HU
@r``A4TlEVYj)Z1&YID5r!6C!kXLLCp(2lb[-jTG$5Qc5cIIEpHmMQP#D91qKU6X
MBL*h"NlDYPeU6Q'K18YQ)LG6*FlD%j5fGfT-TN6,fqrmL-TkTNC*j286FYXV6Ym
T*DZ[GN5M(LpNHNL5rf)NUja@kcki5)reSE"iNE!8%%5kE@PbEFHe++j6FT+`&F`
,TaFR42PqiM"NafC28JS92#*Irh6eiXA*l-#ZcjZ[fmh92RmI2Pj5*TrriGqpG%K
A[hm,Yj(XN[5hq(E`Zm#0l+Rj"$B6Er)rRdT43I(Xq*!!IP4VTpraaB[CP8DKGZ)
Q*,k&[G)L#hbfPEi5H(G)Mk(r$C4XpK@cTa,@lVF,DpT9U+Q58CHUl`,XSU*Km#2
Jr[54-U'Y-Lh69kBN+p$@lK3!G*!!dBZUK)9*-0)mK1dkQNa''+2@+0LTMl[J4Te
rcYhU-plTI[C`ZjPl@QTSQ3V416SMPqiV4V0YpM3"LA+V0T%Yl2ck8LM$38GGQVK
ChNUL38-A9drj)JT%YpP#k,AfC5'BS5)rV&dGA8ZBHPGkDN3,E*G@ANC2P8C)TI)
8fIdVjj!!aGTFCUGmmNTUN!!GG(hIKA``A3#DEY5M$QME05[a-'(&h"R"KbmDC#$
416T4Xa,MkC`+fYF`FL+Th28V["5ldrPd%YSq`PedVIfK$LX-D%1,X90#(&H@d)F
4jl5KfCJKfGJKe,"$UQ#(D$-q!'Q"d'"m'#%'#-'#8'#F'#N'#X'#dc"DB0N-&TQ
#dc"DCJY-`@QB,6-&TQ#e"JY3B,8!rX`@S-&U$"DJ`@S-&U$"DJ`@bQ#f8`@bQ#f
8)dM"E+B,C6"E+B,C6"E+B,CM"E-B,CM"E-B,CJZ8Q#fB`@c'#fB`@c'#fF`@cQ#
fF`@cQ#fF`@cKH!-&XjJYR-&XjJY5B,8Q#e*JY5B,8Q#e*JY5'`ZB,8Q#e*JY8B,
9'#e4JY8B,9'#e4JY8B,9"Xh'#e4JYS-&Y"JYS-&Y"JYS-&Y"JYS-&Y"JYS$!+I!
- -'!I)'!M)'!R)'!V)'![)'!c)'!h)'!l)'!r)!J9PR!3(PT2,8H@XmYKjE6bh(P[
#A#!PaJ*FS#A1!PdJ*GB#AD!PhJ*H)#AQ!PkJ*Hi#Ab!PpJ*IS#Aq!8#!8'!8+!8
1!85!8@!8D!8H!8L!8Q!8U!8Z!8b!8f!8k!8q!9#!9'!9+!91!95!9@!9D!9H!9L
!9Q!9U!9Z!9b!9f!9k!9q!@#!@'!@+!@1!@5!@@!@D!@H!@L!@Q!@U!@Z!@b!@f!
@k!@q!A#!A'!A+!A1!A5!A@!AD!AH!AL!AQ!AU!AZ!Ab!Af!Ak!Aq!J"-)!6#!%`
J"-)!6#!%`J"-)!6#!%`J"-)!6#!%`J"-)!6#!%`J"-)!6#!%`J"-)!62Q`2Qa9c
C!)$jY$jY6jYMjYcjZ$jZ3J"-)!6#!%`J"-)!6#!%`J"-)!6#!%`J"-)!6#!%`J"
- -)!6#!%`J"-)!6#!%`J"-)!6#!%`J"-)!6#!%`J"-X!#q!dU&jLpdj,a&qmhL,qh
pFIU6mB[i5JRj0[m5GKhr5XhArH-AaTNmX+E+-KH9lbVf)C%f),[$Y8(L'+L"5Ql
f0+-C5r4eJaRbP6TF`SR-!`Bd9%&hRa$PmIKQ#AQIB-`idYJjFb(1$-5SJ$-8fB0
+56"0#SJi(p,bl!$fTS$`"rk29S'99Q2N6iEec)X`-b+L$Fd%Rk-X%d+L$Jd(-l!
Je0#CK"XHLpl@fGB6-,@l`%bEk"EX`#&,X#$NhCTD%picUP34a%M1DQFj3"*83ER
rYLD-3MSD`Le0DCK&XI8pid)4J3)+'%D6-8-ER!l2TL-MFp+4KGFm5-cYc3LAAL*
HrFAA1fq!q,Fm)Pi+%R&[6Pe%R5h3KEb#Kk@k&Ah$E+IB8)J2a%akF6c&$rfkSZC
SG95G5JV1(lHJUdCXh1Q3!%MUZ8*'f[QB5Bha(*[4eh5&$#0CQ+'DD1XHKi@*NBA
A2&0'Hep-m,#LJfem,HR'&Miaj62#a-M#kiD-KhhPjF20!NA"ZS0p*![L-4McU*N
BAa16VR5l-HHSYmXf3YMHD'8i`521SQ%R&b$PLfG,Ibe"!C!!aQ+"Y@1FTdeU2!b
'#4KP1A#KDM`-KMpl``j)Rfe$`C)K&!j1(Mq+ke'K1ZfiZQ6hd"f$M"*DJ!RHf
I6MfH6HK@DdZ6CU",1P4XjU'3!%9BF*C4fU1TQ%Rlm36)8+`L5bZR$$Q+"lN6qHI
&@CC)M#kjbr`-b»@H-AA$`T2je9+&CPNL-,VKTF(*X[,LXj5A(q,iKUQImq*
@HK)M#q*dSFF[BFV23Pb0Lm3GJ*r,,-UY$L4'&iJ[i(%8[,Le0iF$`P`@S$#Th(T
E#`'9)3N*'GC1&Uc!i4K&$MhXR@6KDX`1'35-2IiGC1&V!!pI3PaM'8-M6l&aDaB
2fZ%Z)64RRVbSVL2TLCfV(aRXbS`h(da-`N1HaEkV)`X(da$X@$RXIJM)`X(da#4
Kcf,K3LU3!2TL(D8(2Br"'46JI6%*'(2Bkh8%8i(da#A"cf2`4N8i(da#4Kcf2%I
)T52TL'KiFDbi8)SH2Ti%8$LiH)q@SdIV0J1)393H4F@TqIV-N!#A"@E1LT3[1Q,
fKi9RB+%8l&&!ZQ(UH24db+M#HTiA5MDjHMZd9"rEqZ0p$aR`FjcdAPi0fS%%6jJ
##SJ24'm!JHIP,TcDVD3K&J%A52`mm5IY*1'(-8-Fc`fa6%Bp!#C'&eca)cf`)(R
U6q"iZZHh&)q1ZTR21SQ4KGF1LSH$CFA+6cj*kUaI%blB![L-4Md)*NBAa-Ra$`E
*Mip##Id(&iMflDcia+8RSC6)`[%(Z4$LbAPa&0TqL)5i2Fi'&6Z25-+jqL)6d*k
GC1%9Q(k)K,MMq'6V*`LX`r4%*'(`m1XR#eJ!G%3PaX,Jjb@4F@X@$SL%Z),M226
8LZ)r4%c&$2D)E1a!b)b+iMqaB5-2e'I&fDT&J"qNS5i2eF'3!%Gak4CSITq%p#H
Ra-p8Lc3rYb%Z$B!2D`Gak4CSIh6#HK23XSIbd8M&3IhY#e!5i2EBGak4MF2m$K2
3RSH3!-@15-H4rL%*b(rr*q#-M'iIi`#4KrXTmAMY#fkJ2)B-M3dh1-5fKEUL$b'
#HKTZDLiYe4"j$(cXE$b4',Lfl-2*%*F(NiIkFPYh`[dK%j2!j*pT`F!aL-YZN!"
2JS*&HVpfI*[S9aDTDNAQ2AEDUT!!63U)20abYKLI[e%*$VL@YB5I6H)S(A%rL
f+eK*rZQ*'(A%J8%#a46aQ&,mbmZPQipQN!!MT91+NFAB$VN(M!D#kP8kFZMG6(A
+cI)P83U4aI)1ZCqAM*8UQ(lpIBf138GdeS)k9+bT(&i`kk"am0"G5Tf1A4Ze$RT
(Z5Z-6#@'"8pM%`*l(A8d&e+Y8#k-d`kkhY*iP@XU4aI)1Z`C2&jGKD`TqYGSrjV
(ADh&eDb"qYGc-ZXmqm3LZakklh$lH@cKpEa5LXjkkdR!8XH(jcG3LXjkr28I"k0
bjY,U+j(Vmf",VSYd)V'HZVXIap(LUE5kM$3pG6`PdFI,R%JM$mpG3)R-H0CY,U+
e(VmT",SmGLT9QV@BAcI[5m%lI-Dq1Eq*Ui3NJQiZSGS6(b!1`8Ui3LLPkA,R,dN
M!4Y)kZ'")MLl!I)%eLPA#86dd#YVRDZEa3V8U88$X81PUecG-V8S4$R+iS1e+hL
G@PiRkM$Z&MdU03H#JN[#d821qq*8!b+L$Ldi*R#X`iflBeE0b9dYH8"!j,i&*ED
,["$(FD(Da!6"#&-60c-RJ*RYY)8"!"2j6TE5,[",0B(@5UJ&J8%lFc-Caal#5mH
JZLBpIHfQaQp&dJ#F&c8%iZN(HQD3!2JULImd[[EUCrFFkrRY3#U+#Gr@CMqb6Qk
49Fd[S`,T"pH3!1$&F@eCRNLf,f"pD6cHaP9R85rP0m[B(eC0BR9R95%ljG)2fZ2
bk4@JT,qX"G)22NiZk9SJ)28re*HipHfrVP9RX**-rd%XqAYXKbhLG@Ha+"Ji-Kr
[*aG)LJTGcR#241)h##PY&(blR-%6MUVEakLMjGcP#24Sq(HF+@d8I,ZFJ5f*E(,
Sqaac)TbAFi`R&RAH-4J&Y&8blR%%YLpKdNkc)Um5rl!1`'-0N!!R49-Pp+a%i`f
HBp@`Y-FcJqq)Bkh!YV9R-FcJPXBE,akY@Fac12Ij1MQb!HVBF'1E)4k1FKj&0DX
B[Z0G5QjDH@m6VB8LHF-AFMk[&iHcUdrTHL%BAMm*8#C@NT2bD#*amXcc(UY'LIK
6%HMiK4P38,DY0#IH0%YMh[hQ29DD%q-84k00!lr5KE9TS6hr",BPXFic`1+Y6LG
MBJNF4Sa'+&Y@VT1V`5f-0Ae*dV9dRXFL8aD129S8,DZ'j0q!LF5f2@'GjaT@X"B
rCF*a&XSd*1pTkf+TMa4#*a,EVI-T@a[-H0B4kirN!khc+9XEc(Md%YMrTYmhc&Y
e*Mm@K1,'f50kjY1+fl3aq4`RD2E4h5M(Sbe"-ikhA4j5M'!be"-a11Maf$e'+
TPU#(CU(4ii&Y'+TPU#%YMSmA$fT&Qc,8%1#FG(MJ@d@"-Y33PXG(MV**8L`*PU#
%iMSmF#fL`*PU#%YMSmH464A)be"$K`(VT,akM$Jbe-#24kfMb+DeB`k!"k`!cF2
ZF9V%"l83R%CcL&0&NaIF5-3Jr0TaHJL`JSp&k!r0Tp5+D%@2%pkJ[2Ep`)EZVD8
N@+N3m5L1X4S$Tk`5m3km1Zf,DS!k9%(&TakPd[idEUBU3CN9%"MS25BRr#bq1Pb
d($DpE41HMC-6[lbDZQh4S22BRr`PZEJaj&akq`#Fmd5BR(lIKbR1,T!!maLImT2
PdJm$6@*c`f6%iZN(Xc,`r2)H!5Ip0YcKQIhL2LVbH3kVTLFHr@08cLk4#lX1RS4
k154e`C6b)QKBG)`PXFKcF286V-2BNkFXSh2!%k*eQ(3m)R'jih$eDJTUrHSIc*(
X'Y`p@SJDY1c-HXprQH4%pM$5%2NhC3jEa1LHaKT#%6ZA8#FR53LJTKT#%HLFAcF
)K&(c$5%*EF[2bj[B44m`dK#24SqGGjbF)SqBD3K,BPXFZM@F88j-0)3R%I8fqSM
%96-0)3PYdH`q*1N9H-0)3qh`BE0!R49-`rD`41-0QiHVB@Q1C`G8!aeZ"E@V1Bj
R",B`fA$fDeCc(-ipriG(0N!p@`i-Ff3MdFj$b+DeBaIhqG35RP,(C-!6VB8LHR)
@hMk`'J2+drTFk$fl6$Ck0@m%+d@*qQPl!Pqr*8hKp@IK2d-$riBI"@makV3FRjX
4(V,ZMYiI9R`6rf82rdSrC#makV1NRcAL24U2I0bbK@HK2a[#8aUlHBp9RS6mE`M
ebRqFhC+cP*q'BHCjkGYGeXDS9RJ6r!X4k2F[I9d*DK9'8)5Q,4a@)$LY6mM+`*a
(ZYZF3Le@5-Y`P-FYcD29UXNCEK(SlrlQp"E$-MSf%TL`TcZl,BI%H``35kF,H8q
%J['kdKpfm9P3!NU)20U@8p2a@CY+Q1eQG)-c+L$kMZbRT55",4(Fl35fF33aK0S
R24iQ*fjZ$`%lfflXTjmLJPTflR,lQ"i,Qd6RQS6%lqphmCaR0dKjNNKkqpY4i$d
H!*V%jiH*LFA5$fE'N!$ji%%lpc6fDeGZSFQlD#$bqT!!)EGc)FJ6NfE-LAcb+lA
ic0$YUYSm4!8JmIrFd1q`l!mL$A'h4rM[40B(3[A'h4r$el(jE)L"hPEfaM%Xm!i
81N3hdP%hmP3AdHLACifiDYNZf[9mjBA5`f@5hm,2TEGiAbVekq&PppZlj5llYGK
8bl2'[ITdrRT)Kr*,c&4DLFZAeBmImJ!!!!)G1+d!!3!&FA9TBfX"!!!!#&4&@&4
,38K-Tc&rQUFaIj)"!,pi'U3!"J!!!C)!!0M9!!!"GJ!!&&!R5J!!:
- --
_____________________________________________________________________________
Michael A. Kelly Senior Partner
mkelly@cs.uoregon.edu High Risk Ventures
_____________________________________________________________________________
+++++++++++++++++++++++++++
From: Steve Christensen <stevec@apple.com>
Date: Fri, 20 Nov 1992 04:06:08 GMT
Organization: Apple Computer, Inc.
I took a pass at the code and here's what I came up with (changed lines
will have a * in the comment field).
MOVE.W h, D0 ; put height loop variable in D0
MOVEA.L src, A0 ; put the source pixmap address in
A0
MOVEA.L dst, A1 ; put the destination address in A1
MOVEA.L mask, A2 ; put the mask address in A2
CLR.L D2 ; clear the mask register
@1: ; copy the next row
MOVE.W w, D1
@2: ; copy the next eight bytes in the
row
MOVE.B (A2)+, D2 ; copy the next mask byte
BEQ.S @nocopy ;*if zero, don't copy anything
CMPI.B #0xFF, D2
BNE.S @hardway ;*don't copy everything
MOVE.L (A0)+, (A1)+ ; copy all bytes
MOVE.L (A0)+, (A1)+
DBF D1, @2
BRA.S @endloop ;*
@nocopy: ; copy no bytes
ADDQ.L #8, A0
ADDQ.L #8, A1
DBF D1, @2
BRA.S @endloop ;*
@hardway:
MOVEQ #0xF, D3 ;*mask off the lower nibble for
later
AND.B D2, D3 ;*
LSR.W #4, D2 ; shift bits 4-7 into bits 0-3
ADD.W D2, D2 ; double the index
MOVE.W @table(D2.W), D2 ;*calculate the address
LEA @rts1, A3 ; save the return address
JMP @table(D2.W) ; plot four pixels
@rts1:
;*******MOVE.B -1(A2), D2 ; copy the next mask byte
;*******ANDI.B #0xF, D2 ; mask off the high four bits
ADD.W D2, D2 ; double the index
MOVE.W @table(D3.W), D2 ;*calculate the address
LEA @rts2, A3 ; save the return address
JMP @table(D2.W) ; plot four pixels
@rts2:
DBF D1, @2
@endloop:
MOVE.W e, D1 ;*
BLT.S @4 ;*continue if e is less than 0
MOVE.B (A2)+, D2 ; copy the next mask byte
;*******MOVE.W e, D1 ; initialize the loop counter
MOVEQ.L #7, D3 ; initialize the bit counter
@3: ; copy the next byte
BTST D3, D2 ; test the next bit in the mask
BEQ.S @skip ;*if zero, continue
MOVE.B (A0)+, (A1)+ ; else copy the pixel
SUBQ.L #1, D3 ; decrement the bit counter
DBF D1, @3
BRA.S @4 ;*
@skip:
ADDQ.L #1, A0
ADDQ.L #1, A1
SUBQ.L #1, D3 ; decrement the bit counter
DBF D1, @3
@4:
ADDA.L srcNewline, A0 ; bring the src pointer to the
start of the next row
ADDA.L dstNewline, A1 ; bring the dst pointer to the
start of the next row
DBF D0, @1
JMP @end ; skip to the end
@table:
DC.W @sub0-@table ;*
DC.W @sub1-@table ;*
DC.W @sub2-@table ;*
DC.W @sub3-@table ;*
DC.W @sub4-@table ;*
DC.W @sub5-@table ;*
DC.W @sub6-@table ;*
DC.W @sub7-@table ;*
DC.W @sub8-@table ;*
DC.W @sub9-@table ;*
DC.W @sub10-@table ;*
DC.W @sub11-@table ;*
DC.W @sub12-@table ;*
DC.W @sub13-@table ;*
DC.W @sub14-@table ;*
DC.W @sub15-@table ;*
@sub0: ; mask = 0000, draw nothing
ADDQ.L #4, A0
ADDQ.L #4, A1
JMP (A3) ; RTS
@sub1: ; mask = 0001
ADDQ.L #3, A0
ADDQ.L #3, A1
MOVE.B (A0)+, (A1)+
JMP (A3) ; RTS
@sub2: ; mask = 0010
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
JMP (A3) ; RTS
@sub3: ; mask = 0011
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.W (A0)+, (A1)+
JMP (A3) ; RTS
@sub4: ; mask = 0100
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.B (A0), (A1)
ADDQ.L #3, A0
ADDQ.L #3, A1
JMP (A3) ; RTS
@sub5: ; mask = 0101
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.B (A0)+, (A1)+
JMP (A3) ; RTS
@sub6: ; mask = 0110
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.W (A0), (A1)
ADDQ.L #3, A0
ADDQ.L #3, A1
JMP (A3) ; RTS
@sub7: ; mask = 0111
ADDQ.L #1, A0
ADDQ.L #1, A1
MOVE.B (A0)+, (A1)+
MOVE.W (A0)+, (A1)+
JMP (A3) ; RTS
@sub8: ; mask = 1000
MOVE.B (A0), (A1)
ADDQ.L #4, A0
ADDQ.L #4, A1
JMP (A3) ; RTS
@sub9: ; mask = 1001
MOVE.B (A0), (A1)
ADDQ.L #3, A0
ADDQ.L #3, A1
MOVE.B (A0)+, (A1)+
JMP (A3) ; RTS
@sub10: ; mask = 1010
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
JMP (A3) ; RTS
@sub11: ; mask = 1011
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
MOVE.W (A0)+, (A1)+
JMP (A3) ; RTS
@sub12: ; mask = 1100
MOVE.W (A0), (A1)
ADDQ.L #4, A0
ADDQ.L #4, A1
JMP (A3) ; RTS
@sub13: ; mask = 1101
MOVE.W (A0), (A1)
ADDQ.L #3, A0
ADDQ.L #3, A1
MOVE.B (A0)+, (A1)+
JMP (A3) ; RTS
@sub14: ; mask = 1110
MOVE.W (A0)+, (A1)+
MOVE.B (A0), (A1)
ADDQ.L #2, A0
ADDQ.L #2, A1
JMP (A3) ; RTS
@sub15: ; mask = 1111
MOVE.L (A0)+, (A1)+
JMP (A3) ; RTS
@end:
---------------------------
End of C.S.M.P. Digest
**********************